import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv('prosperLoanData.csv')
print(df.describe())
print("*"*60)
print(df.info())
#Nice stats to plot vis a vis occupation or state
df.DebtToIncomeRatio.unique()
df.Term.unique()
#snaity check
df.head()
#Univariate Plot
sns.distplot(df['BorrowerRate'])
plt.title('BorrowerRate Distribution')
#Bi-variate box plot
plt.figure(figsize=[35,15])
base_color = sns.color_palette()[0]
ax = sns.boxplot(data=df, x='LoanStatus', y='LoanOriginalAmount', color=base_color)
ax.tick_params(labelsize=13)
ax.set_xlabel(xlabel='LoanStatus', fontsize=20)
ax.set_ylabel(ylabel='LoanOriginalAmount', fontsize=20)
plt.title('Loan Status Box LoanOriginalAmount Distribution Box Plot', fontsize=30)
# checking total listingkeys
df.ListingKey.count()
# checking unique values including ListingKey uniques
df.nunique();
# Checking how does the duplicate rows look like, also how may rows have duplicates?
df[df.ListingKey.duplicated(keep=False)].sort_values("ListingKey")
#seems there are completely(all columns) duplicate rows in the data set for example:
df[df['ListingKey'] == '0F563597161095613517437']
#There are 1698 such duplicate rows, which we can drop
duplicate_df = df[df.ListingKey.duplicated(keep=False)].sort_values("ListingKey").shape
duplicate_df
#dropping the duplicates
df.drop_duplicates(subset='ListingKey', keep='first', inplace=True)
#checking what all columns we have in the data set
df.columns
sns.set(font_scale=1)
base_color = sns.color_palette()[0]
sns.scatterplot(x='BorrowerRate', y='LenderYield', data=df, color=base_color)
plt.title('BorrowerRate and LenderYield Correlation Scatter Plot', fontsize=15)
# Checking the strong correlation as indicated by above scatter plot
df['BorrowerRate'].corr(df['LenderYield'])
# Which are the top borrowing professions
plt.figure(figsize=[20, 5])
df_profession_loan = df.groupby(['Occupation'])['LoanOriginalAmount'].sum().sort_values(ascending=False).reset_index(name='borrowed_sum')
base_color = sns.color_palette()[0]
ax=sns.barplot(x='Occupation', y='borrowed_sum', data=df_profession_loan.head(10), color=base_color)
plt.title('Occupation vise Borrowed Sum Bar Plot', fontsize=15)
ax.set(ylabel='Total Borrowed Amount')
#Violin plot for top ten borrowing professions
sns.set(font_scale=3)
plt.figure(figsize=[50, 25])
base_color = sns.color_palette()[0]
sns.violinplot(x='Occupation' , y='LoanOriginalAmount', data=df[df['Occupation'].isin(['Other', 'Professional', 'Executive', 'Computer Programmer','Analyst', 'Sales - Commission', 'Accountant/CPA','Teacher','Nurse (RN)', 'Administrative Assistant'])], color=base_color)
plt.title('Occupation vise LoanOriginalAmount Violin Plot', fontsize=40)
Following group of columns have good postive correlation with each other: TotalCreditLinespast7years, CurrentCreditLines, OpenCreditLines, OpenRevolvingAccounts, OpenRevolvingMonthlyPayment
Following group of columns have good postive correlation with each other: BorrowerAPR, BorrowerRate, LenderYield, EstimatedEffectiveYield, EstimatedLoss, EstimatedReturn
TotalTrades is having good postive correlation with TotalCreditLinespast7years, CurrentCreditLines, OpenCreditLines, OpenRevolvingAccounts
# Multi variate plot
sns.set(font_scale=10)
select_var = ['TotalCreditLinespast7years', 'CurrentCreditLines', 'OpenCreditLines', 'OpenRevolvingAccounts', 'OpenRevolvingMonthlyPayment', 'BorrowerAPR', 'BorrowerRate', 'LenderYield', 'EstimatedEffectiveYield', 'EstimatedLoss', 'EstimatedReturn' ]
df_corr = df[select_var].copy()
plt.figure(figsize=[250, 250])
plt.yticks(fontsize=100)
ax=sns.heatmap(df_corr.corr(), vmin=0, vmax=0.5, cmap='coolwarm', square=True, annot=True, fmt='.2f', linewidth=.25)
ax.tick_params(labelbottom='on',labeltop='on')
plt.xticks(rotation=90)
plt.xticks(fontsize=100)
plt.title('Correlation Heat Map of all Columns', fontsize=300)
#Are there any data type issues in the data set?
print(df.shape)
print(df.dtypes)
# ListingCreationDate, ClosedDate, should be date time object
# What are all the Alpha unique ratings in the data set
df['ProsperRating (Alpha)'].unique()
#prosper rating plot; checking how may entries below to certain rating?
sns.set(font_scale=1)
df_alfa_rating = df.groupby(['ProsperRating (Alpha)']).size().reset_index(name='prosper_rating_counts')
ax=df_alfa_rating.plot.bar(x='ProsperRating (Alpha)')
plt.title('Prosperity Rating Count Bar Plot', fontsize=15)
ax.set(xlabel='Rating(Alpha)')
#Where the borrowers are from, checking using pandas groupby and bar plot?
df_state = df.groupby(['BorrowerState']).size().sort_values(ascending=False).reset_index(name='state_abbr_name')
df_state.plot.bar(x='BorrowerState', figsize=(10,4))
plt.title('Borrowers Count State vise Bar Plot', fontsize=15)
# More borrowers belong to which Employment status?
df.EmploymentStatus.unique()
df_employ_status = df.groupby(['EmploymentStatus']).size().sort_values(ascending=False).reset_index(name='employment_count')
df_employ_status.plot.bar(x='EmploymentStatus', figsize=(10,4))
plt.title('Borrowers from different Employment Status', fontsize=15)
# How many borrowers are homeowners and how many are not home owners?
df.IsBorrowerHomeowner.unique()
df_homeowner = df.groupby(['IsBorrowerHomeowner']).size().reset_index(name='homeowner_count')
df_homeowner.rename(index={0:'not_home_owner', 1:'home_owner'}, inplace=True)
ax = df_homeowner.plot.pie(y='homeowner_count', figsize=(5,5))
ax.legend(loc='upper left')
plt.title('House Owner vs. Non House Owner(Borrowers Count)', fontsize=15)
# Does the amount of loan vary significantly between home owners and non home owners?
df_homeowner_borrowed_sum = df.groupby(['IsBorrowerHomeowner'])['LoanOriginalAmount'].sum().sort_values(ascending=False).reset_index(name='borrowed_sum')
df_homeowner_borrowed_sum.rename(index={0:'not_home_owner', 1:'home_owner'}, inplace=True)
df_homeowner_borrowed_sum.plot.pie(y='borrowed_sum', figsize=(5,5))
plt.title('House Owner vs. Non House Owner(Borrowers Amount)', fontsize=15)
# Which states borrowed more money?
df_borrowed_sum = df.groupby(['BorrowerState'])['LoanOriginalAmount'].sum().sort_values(ascending=False).reset_index(name='borrowed_sum')
df_borrowed_sum.plot.bar(x='BorrowerState', figsize=(10,4))
plt.title('State vise sum of LoanOriginalAmount', fontsize=15)